> library(ggplot2)
> library(reshape)
> library(gplots)
> library(edgeR)
> library(CHBUtils)
> library(pheatmap)
> library(knitr)
> library(DESeq2)
> project_summary = "../project-summary.csv"
> counts_file = "../combined.counts"
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
+ "#D55E00", "#CC79A7")
> summarydata = read.table(project_summary, header = TRUE, sep = ",")
> rownames(summarydata) = summarydata$Name
> summarydata = summarydata[order(summarydata$Name), ]
> counts = read.table(counts_file, header = TRUE, row.names = "id")
> counts = counts[, order(colnames(counts))]
> # this is a list of all non user-supplied metadata columns that could appear
> known_columns = c("Name", "X.GC", "Exonic.Rate", "Sequences.flagged.as.poor.quality",
+ "rRNA.rate", "Fragment.Length.Mean", "Intronic.Rate", "Intergenic.Rate",
+ "Mapping.Rate", "Quality.format", "Duplication.Rate.of.Mapped", "Mapped",
+ "rRNA", "Sequence.length", "Transcripts.Detected", "Mean.Per.Base.Cov.",
+ "Genes.Detected", "Unique.Starts.Per.Read", "unique_starts_per_read", "complexity")
> batch2 = read.table("../batch_2_samples.txt", sep = "\t")
> batch2$batch = 2
> m = merge(summarydata, batch2, by.x = "Name", by.y = "V1", all.x = TRUE)
> m$batch[is.na(m$batch)] = 1
> summarydata = m
> summarydata = summarydata[, !colnames(summarydata) %in% c("species", "time")]
> summarydata$tissue_status = as.factor(paste(summarydata$tissue, summarydata$status,
+ sep = "_"))
> rownames(summarydata) = summarydata$Name
> get_heatmap_fn = function(summarydata) {
+ # return the pheatmap function with or without metadata
+ metadata = summarydata[, !colnames(summarydata) %in% known_columns, drop = FALSE]
+ metadata = cbind(metadata, summarydata[, c("Mapping.Rate", "X.GC")])
+ if (ncol(metadata) == 0) {
+ return(pheatmap)
+ } else {
+ rownames(metadata) = summarydata$Name
+ heatmap_fn = function(data, ...) {
+ pheatmap(data, annotation = metadata, ...)
+ }
+ return(heatmap_fn)
+ }
+ }
> heatmap_fn = get_heatmap_fn(summarydata)
> numeric_columns = c("X.GC", "Exonic.Rate", "rRNA.rate", "Intronic.Rate", "Intergenic.Rate",
+ "Mapping.Rate", "Mapped", "Fragment.Length.Mean", "Genes.Detected", "Transcripts.Detected")
> sumdf = data.frame(summary(summarydata[, numeric_columns]))
>
>
> ## Mapped reads
> ggplot(summarydata, aes(x = Name, y = Mapped)) + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")
> ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") +
+ ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90))
> dd = data.frame(Name = names(counts), Genes.Detected = colSums(counts > 0))
> ggplot(dd, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("genes detected") +
+ xlab("")
> ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") +
+ xlab("")
> ggplot(summarydata, aes(x = Name, y = rRNA.rate)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") +
+ xlab("")
> ggplot(summarydata, aes(x = Name, y = Fragment.Length.Mean)) + geom_bar(stat = "identity") +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("fragment length") +
+ xlab("")
> melted = melt(counts)
> colnames(melted) = c("sample", "count")
> melted$sample = factor(melted$sample)
> melted$sample = reorder(melted$sample, colnames(counts))
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
Trimmed mean of M-values (TMM) normalization is described here
Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25
> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
> melted = melt(normalized_counts)
> colnames(melted) = c("gene", "sample", "count")
> melted$sample = factor(melted$sample)
> melted$sample = reorder(melted$sample, colnames(counts))
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
> ggplot(melted, aes(x = count, group = sample)) + geom_density() + theme_bw(base_size = 10) +
+ theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) +
+ xlab("")
> heatmap_fn(cor(normalized_counts, method = "pearson"), fontsize = 6)
GC content seems to cluster with tissue type with the head and rest of body having lower GC content than the spermatheca, MAG and atrium.
> heatmap_fn(cor(normalized_counts, method = "spearman"), fontsize = 6)
One sample, DT3_ALBI_Sp_M is very different from all of the other samples. This has the lower number of transcripts detected at 1.2k which is about 1/10th of the other samples. This sample likely has some kind of either contamination issue or PCR artifact happening, where a single gene or a small number of genes was sequenced repeatedly.
> kable(summarydata[summarydata$Name == "DT3_ALBI_Sp_M", ], format = "markdown")
| Name | X.GC | Exonic.Rate | Sequences.flagged.as.poor.quality | rRNA.rate | Fragment.Length.Mean | Intronic.Rate | Intergenic.Rate | Mapping.Rate | Quality.format | Duplication.Rate.of.Mapped | Mapped | rRNA | unique_starts_per_read | Sequence.length | Transcripts.Detected | Mean.Per.Base.Cov. | complexity | Genes.Detected | tissue | status | sex | rep | batch | tissue_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| DT3_ALBI_Sp_M | DT3_ALBI_Sp_M | 51 | 0.5814477 | 0 | 0 | 101 | 0.2427143 | 0.175766 | 0.9899027 | standard | 0 | 9214889 | 0 | NA | 25-101 | 1216 | 79.68765 | NA | 1 | spermatheca | mated | female | D | 2 | spermatheca_mated |
as opposed to the first five samples sequenced:
> kable(head(summarydata), format = "markdown")
| Name | X.GC | Exonic.Rate | Sequences.flagged.as.poor.quality | rRNA.rate | Fragment.Length.Mean | Intronic.Rate | Intergenic.Rate | Mapping.Rate | Quality.format | Duplication.Rate.of.Mapped | Mapped | rRNA | unique_starts_per_read | Sequence.length | Transcripts.Detected | Mean.Per.Base.Cov. | complexity | Genes.Detected | tissue | status | sex | rep | batch | tissue_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ACD_T3_RBm_Albi | ACD_T3_RBm_Albi | 36 | 0.3994357 | 0 | 0 | 113 | 0.1882835 | 0.4120508 | 0.5325388 | standard | 0 | 62323037 | 0 | NaN | 25-101 | 10322 | 47.74643 | NA | 1 | body | mated | male | A-C-D | 1 | body_mated |
| AT3_ALBI_Sp_M | AT3_ALBI_Sp_M | 50 | 0.5859818 | 0 | 0 | 154 | 0.1694706 | 0.2443366 | 0.5953684 | standard | 0 | 66183201 | 0 | NaN | 25-101 | 9434 | 62.89629 | NA | 1 | spermatheca | mated | female | A | 2 | spermatheca_mated |
| AT3_ALBI_Sp_V | AT3_ALBI_Sp_V | 52 | 0.6092181 | 0 | 0 | 283 | 0.1749786 | 0.2157042 | 0.6819525 | standard | 0 | 179442063 | 0 | NaN | 25-101 | 10027 | 175.74693 | NA | 1 | spermatheca | virgin | female | A | 2 | spermatheca_virgin |
| AT3_MAG_M_Albi | AT3_MAG_M_Albi | 45 | 0.5559447 | 0 | 0 | 137 | 0.2191971 | 0.2247786 | 0.8584414 | standard | 0 | 58087516 | 0 | NaN | 25-101 | 8959 | 33.30295 | NA | 1 | MAG | mated | male | A | 1 | MAG_mated |
| BCD_T3_RBv_Albi | BCD_T3_RBv_Albi | 35 | 0.3455879 | 0 | 0 | 113 | 0.2008765 | 0.4532733 | 0.5256571 | standard | 0 | 66953895 | 0 | NaN | 25-101 | 9731 | 44.50073 | NA | 1 | body | virgin | male | B-C-D | 1 | body_virgin |
| BT3_ALBI_At_M | BT3_ALBI_At_M | 45 | 0.4863401 | 0 | 0 | 178 | 0.1644467 | 0.3491574 | 0.7796990 | standard | 0 | 128394650 | 0 | NaN | 25-101 | 9823 | 137.04950 | NA | 1 | atrium | mated | female | B | 1 | atrium_mated |
We should drop DT3_ALBI_Sp_M from the analysis.
> summarydata = summarydata[!rownames(summarydata) == "DT3_ALBI_Sp_M", ]
> counts = counts[, !colnames(counts) == "DT3_ALBI_Sp_M"]
> normalized_counts = normalized_counts[, !colnames(normalized_counts) == "DT3_ALBI_Sp_M"]
The overall exonic mapping rates is low:
> summary(summarydata$Exonic.Rate)
Min. 1st Qu. Median Mean 3rd Qu. Max. 0.3103 0.4006 0.4863 0.4857 0.5889 0.6468
It isn’t clear if these issues are due to the fact the genome/annotation for the mosquito is poor or if there is a problem with the samples.
> mds(normalized_counts, k = length(colnames(normalized_counts)) - 1)
There is pretty consistent separation of the head with M/V status, and maybe some separation in MAG with M/V status, barring CT3, and an overall separation of tissue type. Interestingly, atrium and rest of body samples look pretty similar to each other.
> select = order(rowMeans(counts), decreasing = TRUE)[1:30]
> heatmap_fn(counts[select, ])
We’ll look at differences that are driven by mating status in the same tissue. The way we will do this is to create factors that describe both the tissue status and mating status, and then do pairwise comparisons across those tissues. This will answer the question of what genes are different in each specific tissue.
> library(DESeq2)
> library(DEGreport)
> library(vsn)
> design = ~sex + tissue_status
> condition = "tissue"
> counts <- counts[rowSums(counts > 0) > 1, ]
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = design)
> dds = DESeq(dds)
Now that we have the differential expression calls using the tissue and status, we can pull out the mating specific differences in each tissue.
> atrium = results(dds, contrast = list(c("tissue_statusatrium_mated"), c("tissue_statusatrium_virgin")))
> write.table(atrium, file = "atrium_deseq2.tsv", sep = "\t", row.names = TRUE,
+ col.names = TRUE, quote = FALSE)
> atrium_de = subset(atrium, padj < 0.1)
There are 1067 genes flagged as differentally expressed between the atrium of mated and virgin samples, with 370 upregulated in mated samples and 697 in atrium samples.
There is quite a bit of variability between samples:
> DESeq2::plotMA(atrium)
> atrium_stats = as.data.frame(atrium[, c(2, 6)])
> volcano_density_plot(atrium_stats, title = names(atrium_stats), lfc.cutoff = 1.5)
> head = results(dds, contrast = list(c("tissue_statushead_mated"), c("tissue_statushead_virgin")))
> write.table(head, file = "head_deseq2.tsv", sep = "\t", row.names = TRUE, col.names = TRUE,
+ quote = FALSE)
> head_de = subset(head, padj < 0.1)
There are 571 genes flagged as differentally expressed between the atrium of mated and virgin samples, with 288 upregulated in mated samples and 283 in atrium samples.
There is quite a bit of variability between samples:
> DESeq2::plotMA(head)
> head_stats = as.data.frame(head[, c(2, 6)])
> volcano_density_plot(head_stats, title = names(head_stats), lfc.cutoff = 1.5)
> MAG = results(dds, contrast = list(c("tissue_statusMAG_mated"), c("tissue_statusMAG_virgin")))
> write.table(MAG, file = "MAG_deseq2.tsv", sep = "\t", row.names = TRUE, col.names = TRUE,
+ quote = FALSE)
> MAG_de = subset(MAG, padj < 0.1)
There are 1088 genes flagged as differentally expressed between the atrium of mated and virgin samples, with 583 upregulated in mated samples and 505 in atrium samples.
There is quite a bit of variability between samples:
> DESeq2::plotMA(MAG)
> MAG_stats = as.data.frame(MAG[, c(2, 6)])
> volcano_density_plot(MAG_stats, title = names(MAG_stats), lfc.cutoff = 1.5)
> summarydata$sex_tissue_status = paste(summarydata$sex, summarydata$tissue_status,
+ sep = "_")
> design_body = ~sex_tissue_status
> dds_body = DESeqDataSetFromMatrix(countData = counts, colData = summarydata,
+ design = design_body)
> dds_body = DESeq(dds_body)
> female_body = results(dds_body, contrast = list(c("sex_tissue_statusfemale_body_mated"),
+ c("sex_tissue_statusfemale_body_virgin")))
> male_body = results(dds_body, contrast = list(c("sex_tissue_statusmale_body_mated"),
+ c("sex_tissue_statusmale_body_virgin")))
> write.table(male_body, file = "male_body_deseq2.tsv", sep = "\t", row.names = TRUE,
+ col.names = TRUE, quote = FALSE)
> write.table(female_body, file = "female_body_deseq2.tsv", sep = "\t", row.names = TRUE,
+ col.names = TRUE, quote = FALSE)
> male_body_de = subset(male_body, padj < 0.1)
> female_body_de = subset(female_body, padj < 0.1)
There are 0 genes flagged as differentally expressed between the atrium of mated and virgin samples in the male, with 0 upregulated in mated samples and 0 in the virgin samples samples.
There are 3 genes flagged as differentally expressed between the atrium of mated and virgin samples in the female, with 0 upregulated in mated samples and 3 in the virgin samples samples.
> DESeq2::plotMA(body)
> body_stats = as.data.frame(body[, c(2, 6)])
> volcano_density_plot(body_stats, title = names(body_stats), lfc.cutoff = 1.5)
> spermatheca = results(dds, contrast = list(c("tissue_statusspermatheca_mated"),
+ c("tissue_statusspermatheca_virgin")))
> write.table(spermatheca, file = "spermatheca_deseq2.tsv", sep = "\t", row.names = TRUE,
+ col.names = TRUE, quote = FALSE)
> spermatheca_de = subset(spermatheca, padj < 0.1)
There are 978 genes flagged as differentally expressed between the atrium of mated and virgin samples, with 655 upregulated in mated samples and 323 in atrium samples.
There is quite a bit of variability between samples:
> DESeq2::plotMA(spermatheca)
> spermatheca_stats = as.data.frame(spermatheca[, c(2, 6)])
> volcano_density_plot(spermatheca_stats, title = names(spermatheca_stats), lfc.cutoff = 1.5)
Samples tend to cluster on the PCA plot by tissue and to a lesser degree mated/virgin status, that is a good sign.
> rld = rlog(dds)
> plotPCA(rld, intgroup = c("tissue", "status"))
There is a huge amount of dispersion between these samples.
> par(mfrow = c(1, 3))
> notAllZero <- (rowSums(counts(dds)) > 0)
> rld <- rlog(dds)
> vsd <- varianceStabilizingTransformation(dds)
> rlogMat <- assay(rld)
> vstMat <- assay(vsd)
>
> meanSdPlot(log2(counts(dds, normalized = TRUE)[notAllZero, ] + 1), ylim = c(0,
+ 2.5))
> meanSdPlot(assay(rld[notAllZero, ]), ylim = c(0, 2.5))
> meanSdPlot(assay(vsd[notAllZero, ]), ylim = c(0, 2.5))
> plotDispEsts(dds)
One of the things we can do with this data is to produce a set of tissue-specific markers. We’re looking for a set a gene signatures that can be used to classify the different tissues.
We’ll do NMF to pull out gene signatures for the different individual tissues.
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
+ "#D55E00", "#CC79A7", "#000000")
>
> library(edgeR)
> library(limma)
> library(NMF)
> design = ~sex + tissue_status + sex:tissue_status
> dge = DGEList(counts = counts)
> dge = calcNormFactors(y)
> voomed = voom(dge, plot = TRUE)
> # tissue_metadata = subset(summarydata, tissue != 'body')
> tissue_metadata <- summarydata
> tissue_only = normalized_counts[, colnames(normalized_counts) %in% rownames(tissue_metadata)]
> tissue_only = tissue_only[rowSums(tissue_only) > 1, ]
> adf_tissue = AnnotatedDataFrame(tissue_metadata[, c("sex", "tissue")], data.frame(labelDescription = c("sex",
+ "tissue")))
> x = ExpressionSet(as.matrix(tissue_only), adf_tissue)
> nbasis <- length(unique(summarydata$tissue))
> n = nmf(x, nbasis, nrun = 100, .options = list(parallel = FALSE))
> consensus = consensusmap(n, annCol = x)
> basismap(n, scale = "r1", annColors = list(basis = cbPalette[4:8], consensus = brewer.pal(4,
+ "Spectral")), main = "Metagene Components - All Contributing Genes", Rowv = TRUE)
> s = featureScore(n)
> summary(s)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.001792 0.075890 0.175600 0.269500 0.403700 1.000000
> s = extractFeatures(n)
> str(s)
List of 5
$ : int [1:41] 7694 3839 2517 785 3200 5900 1379 5426 6787 5979 ...
$ : int [1:173] 9616 2207 1013 9743 9617 2221 11340 8458 5887 4557 ...
$ : int [1:63] 513 2351 2713 2716 4356 4358 4360 4362 6550 2714 ...
$ : int [1:91] 1690 6958 510 7882 508 7554 6957 509 2836 6660 ...
$ : int [1:186] 604 1224 8524 9150 9592 8195 1718 3629 6050 5377 ...
- attr(*, "method")= chr "kim"
> z = featureScore(n)
>
>
> numfeatures <- sapply(seq(0, 1, 0.01), function(num) {
+ lapply(extractFeatures(n, num), function(x) {
+ length(x)
+ })
+ })
> minnumfeatures = 20
> rel.basis.contrib.cutoff <- max(seq(0, 1, 0.01)[apply(numfeatures, 2, function(x) all(x >
+ minnumfeatures))])
>
> basismap(n, scale = "r1", subsetRow = rel.basis.contrib.cutoff, annColors = list(basis = cbPalette[4:8],
+ consensus = brewer.pal(4, "Spectral")), main = "Metagene Components - Most Specific Genes")
> coefmap(n, scale = "c1", labCol = tissue_metadata$tissue, annColors = list(basis = cbPalette[4:8]))
> # write out the metagene features
>
> fs = featureScore(n)
> features = extractFeatures(n, 0.9)
> head_features = fs[features[[5]]]
> head_features[is.na(head_features)] = 1
> head_counts = tissue_only[names(head_features), colnames(tissue_only) %in% subset(summarydata,
+ tissue == "head")$Name]
> head_scores = head_features * log(rowMeans(head_counts))
> head_plot = sort(head_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(head_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")
> ggsave("head_metagene_plot.pdf")
> write.table(head_scores, file = "head_metagene_scores.txt", quote = FALSE, col.names = FALSE)
>
>
> atrium_features = fs[features[[1]]]
> atrium_features[is.na(atrium_features)] = 1
> atrium_counts = tissue_only[names(atrium_features), colnames(tissue_only) %in%
+ subset(summarydata, tissue == "atrium")$Name]
> atrium_scores = atrium_features * log(rowMeans(atrium_counts))
> atrium_plot = sort(atrium_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(atrium_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")
> ggsave("atrium_metagene_plot.pdf")
> write.table(atrium_scores, file = "atrium_metagene_scores.txt", quote = FALSE,
+ col.names = FALSE)
>
>
> spermatheca_features = fs[features[[4]]]
> spermatheca_features[is.na(spermatheca_features)] = 1
> spermatheca_counts = tissue_only[names(spermatheca_features), colnames(tissue_only) %in%
+ subset(summarydata, tissue == "spermatheca")$Name]
> spermatheca_scores = spermatheca_features * log(rowMeans(spermatheca_counts))
> spermatheca_plot = sort(spermatheca_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(spermatheca_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")
> ggsave("spermatheca_metagene_plot.pdf")
> write.table(spermatheca_scores, file = "spermatheca_metagene_scores.txt", quote = FALSE,
+ col.names = FALSE)
>
> MAG_features = fs[features[[3]]]
> MAG_features[is.na(MAG_features)] = 1
> MAG_counts = tissue_only[names(MAG_features), colnames(tissue_only) %in% subset(summarydata,
+ tissue == "MAG")$Name]
> MAG_scores = MAG_features * log(rowMeans(MAG_counts))
> MAG_plot = sort(MAG_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(MAG_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")
> ggsave("MAG_metagene_plot.pdf")
> write.table(MAG_scores, file = "MAG_metagene_scores.txt", quote = FALSE, col.names = FALSE)
>
> body_features = fs[features[[2]]]
> body_features[is.na(body_features)] = 1
> body_counts = tissue_only[names(body_features), colnames(tissue_only) %in% subset(summarydata,
+ tissue == "body")$Name]
> body_scores = body_features * log(rowMeans(body_counts))
> body_plot = sort(body_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(body_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() +
+ theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5,
+ color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")
> ggsave("body_metagene_plot.pdf")
> write.table(body_scores, file = "body_metagene_scores.txt", quote = FALSE, col.names = FALSE)
We can see that the metagenes separate out the different body parts for the most part, except for a single body sample which has a signature more similar to the atrium than the body samples. Could that sample have both atrium and the rest of body in it?
Why are 422 and 423 missing? It looks like they are only expressed in MAG, and are expressed at an extremely high level in MAG.
> missing_genes = c("AALB000422", "AALB000423")
> counts[missing_genes, ]
ACD_T3_RBm_Albi AT3_ALBI_Sp_M AT3_ALBI_Sp_V AT3_MAG_M_Albi
AALB000422 0 12 65 874156
AALB000423 0 0 12 317583
BCD_T3_RBv_Albi BT3_ALBI_At_M BT3_ALBI_At_V BT3_ALBI_H_M
AALB000422 1 0 0 0
AALB000423 5 0 0 0
BT3_ALBI_H_V BT3_ALBI_RB_M BT3_ALBI_RB_V BT3_ALBI_Sp_M
AALB000422 0 0 0 47
AALB000423 0 0 0 9
BT3_ALBI_Sp_V BT3_MAG_V_Albi CT3_ALBI_At_M CT3_ALBI_At_V
AALB000422 0 307223 0 0
AALB000423 0 40297 0 0
CT3_ALBI_H_M CT3_ALBI_H_V CT3_ALBI_RB_M CT3_ALBI_RB_V
AALB000422 0 0 0 0
AALB000423 0 0 0 0
CT3_MAG_M_Albi CT3_MAG_V_Albi DT3_ALBI_At_M DT3_ALBI_At_V
AALB000422 1107671 594214 1 1
AALB000423 177929 52428 2 0
DT3_ALBI_H_M DT3_ALBI_H_V DT3_ALBI_RB_M DT3_ALBI_RB_V
AALB000422 0 0 0 0
AALB000423 0 0 0 0
DT3_ALBI_Sp_V DT3_MAG_M_Albi DT3_MAG_V_Albi
AALB000422 0 1781563 736322
AALB000423 0 465594 108771
> MAG[missing_genes, ]
log2 fold change (MAP): tissue_statusMAG_mated vs tissue_statusMAG_virgin
Wald test p-value: tissue_statusMAG_mated vs tissue_statusMAG_virgin
DataFrame with 2 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
AALB000422 240320.86 1.291841 1.128429 1.144813 NA
AALB000423 53559.16 2.234039 1.445440 1.545577 0.1222067
padj
<numeric>
AALB000422 NA
AALB000423 0.4165237
We can see there is a high SE for the log fold change, which is why these are not being called significant, it isn’t that these genes aren’t expressed.
> old_counts = read.table("/Volumes/Clotho/Users/rory/cache/mosquito_project/data/albimanus/combined.counts",
+ header = TRUE, sep = "\t", row.names = 1)
> colnames(old_counts) = c("AT3_MAG_M_Albi", "BT3_ALBI_At_M", "BT3_ALBI_At_V",
+ "BT3_MAG_V_Albi", "CT3_ALBI_At_M", "CT3_ALBI_At_V", "CT3_MAG_M_Albi", "CT3_MAG_V_Albi",
+ "DT3_ALBI_At_M", "DT3_ALBI_At_V", "DT3_MAG_M_Albi", "DT3_MAG_V_Albi")
> old_MAG = read.table("/Volumes/Clotho/Users/rory/cache/mosquito_project/scripts/albimanus_MAG.tsv",
+ sep = "\t", header = TRUE)
> library(reshape)
> melted_counts = melt(as.matrix(counts))
> colnames(melted_counts) = c("gene", "sample", "count")
> melted_counts$run = "new"
>
> old_melted_counts = melt(as.matrix(old_counts))
> colnames(old_melted_counts) = c("gene", "sample", "count")
> old_melted_counts$run = "old"
> head(old_melted_counts)
gene sample count run
1 AALB014361 AT3_MAG_M_Albi 22 old
2 AALB003697 AT3_MAG_M_Albi 45 old
3 AALB003698 AT3_MAG_M_Albi 427 old
4 AALB014362 AT3_MAG_M_Albi 39 old
5 AALB014363 AT3_MAG_M_Albi 0 old
6 AALB003699 AT3_MAG_M_Albi 16 old
> in_both = rbind(melted_counts[melted_counts$sample %in% old_melted_counts$sample,
+ ], old_melted_counts)
> mcompare = merge(melted_counts, old_melted_counts, by.x = c("gene", "sample"),
+ by.y = c("gene", "sample"), all = FALSE)
We can see below that in general we have more reads aligning to each gene with the new analysis. This is likely to be partially due improvements in the aligner and improvements in the gene models.
> ggplot(mcompare, aes(count.x, count.y)) + geom_point() + xlab("new") + ylab("old") +
+ scale_y_sqrt() + scale_x_sqrt()
Looking at the old code that just had the atrium and MAG to compare, we did the same thing where we built the model using all of the samples. We used limma last time to call differential expression instead of DESeq2 which is another source of difference; we could do the same thing again for this sample and see if it is a DESeq2 vs limma difference or not.
> library(edgeR)
> design = model.matrix(~0 + tissue_status, data = summarydata)
> colnames(design) = gsub("tissue_status", "", colnames(design))
> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> v = voom(y, design, plot = TRUE)
> fit_limma = lmFit(v, design)
> cm = makeContrasts(atrium = atrium_mated - atrium_virgin, MAG = MAG_mated -
+ MAG_virgin, head = head_mated - head_virgin, spermatheca = spermatheca_mated -
+ spermatheca_virgin, levels = design)
> fit_limma2 = contrasts.fit(fit_limma, cm)
> fit_limma2 = eBayes(fit_limma2)
> MAG_results_limma = topTable(fit_limma2, n = Inf, p.value = 1, coef = "MAG")
> MAG_results_limma[missing_genes, ]
logFC AveExpr t P.Value adj.P.Val B
AALB000422 1.380099 -0.2388494 2.090200 0.047002900 0.52990333 -4.1047694
AALB000423 2.439860 -0.9987175 3.700783 0.001072736 0.06534383 -0.7726864
ALB000422 gets knocked out of being significant if you use limma instead of DESeq2, but AALB000423 doesn’t. If we just do what we did before, include only the atirum and the MAG results, they are both significant.
> library(edgeR)
> summarysubset = subset(summarydata, tissue %in% c("atrium", "MAG"))
> countssubset = counts[, colnames(counts) %in% summarysubset$Name]
> summarysubset$tissue_status = as.character(summarysubset$tissue_status)
> summarysubset$tissue = as.character(summarysubset$tissue)
> summarysubset$status = as.character(summarysubset$status)
> design = model.matrix(~0 + tissue_status, data = summarysubset)
> colnames(design) = gsub("tissue_status", "", colnames(design))
> y = DGEList(counts = countssubset)
> y = calcNormFactors(y)
> v = voom(y, design, plot = FALSE)
> fit_limma_subset = lmFit(v, design)
> cm = makeContrasts(atrium = atrium_mated - atrium_virgin, MAG = MAG_mated -
+ MAG_virgin, levels = design)
> fit_limma_subset2 = contrasts.fit(fit_limma_subset, cm)
> fit_limma_subset2 = eBayes(fit_limma_subset2)
> MAG_results_limma_subset = topTable(fit_limma_subset2, n = Inf, p.value = 1,
+ coef = "MAG")
> MAG_results_limma_subset[missing_genes, ]
logFC AveExpr t P.Value adj.P.Val B
AALB000422 1.417859 5.467304 4.159149 0.0015387210 0.07765818 -1.013592
AALB000423 2.472445 4.127719 5.662333 0.0001376023 0.02262663 1.358130
However quantile normalizing the voom counts also has an effect:
> library(edgeR)
> summarysubset = subset(summarydata, tissue %in% c("atrium", "MAG"))
> countssubset = counts[, colnames(counts) %in% summarysubset$Name]
> summarysubset$tissue_status = as.character(summarysubset$tissue_status)
> summarysubset$tissue = as.character(summarysubset$tissue)
> summarysubset$status = as.character(summarysubset$status)
> design = model.matrix(~0 + tissue_status, data = summarysubset)
> colnames(design) = gsub("tissue_status", "", colnames(design))
> y = DGEList(counts = countssubset)
> y = calcNormFactors(y)
> v = voom(y, design, plot = FALSE, normalize = "quantile")
> fit_limma_subset_quantile = lmFit(v, design)
> cm = makeContrasts(atrium = atrium_mated - atrium_virgin, MAG = MAG_mated -
+ MAG_virgin, levels = design)
> fit_limma_subset_quantile2 = contrasts.fit(fit_limma_subset_quantile, cm)
> fit_limma_subset_quantile2 = eBayes(fit_limma_subset_quantile2)
> MAG_results_limma_subset_quantile = topTable(fit_limma_subset_quantile2, n = Inf,
+ p.value = 1, coef = "MAG")
> MAG_results_limma_subset_quantile[missing_genes, ]
logFC AveExpr t P.Value adj.P.Val B
AALB000422 0.1981414 4.956242 0.5290292 0.609346864 0.9530702 -6.701848
AALB000423 1.1355503 3.925534 3.3622285 0.008137943 0.1683863 -2.862057
So there are a set of genes that are pretty sensitive to how we do the analysis; if we swap callers, or how we combine them together or how we do the normalization, it matters. These are a set of genes that are not robust changes. We can make reasonable arguments for doing any of these.
> MAG <- MAG[order(rownames(MAG)), ]
> MAG_results_limma <- MAG_results_limma[order(rownames(MAG_results_limma)), ]
> MAG_results_limma_subset <- MAG_results_limma_subset[order(rownames(MAG_results_limma_subset)),
+ ]
> MAG_results_limma_subset_quantile <- MAG_results_limma_subset_quantile[order(rownames(MAG_results_limma_subset_quantile)),
+ ]
The actual fold changes between doing an analysis on the reduced dataset vs. the full one are not much different.
> qplot(MAG_results_limma$logFC, MAG_results_limma_subset$logFC) + xlab("full model") +
+ ylab("reduced model") + theme_bw() + theme(text = element_text(family = "Gill Sans",
+ size = 10), strip.background = element_rect(fill = "white"))
> qplot((MAG_results_limma$logFC + MAG_results_limma_subset$logFC)/2, MAG_results_limma$logFC -
+ MAG_results_limma_subset$logFC, geom = "hex") + xlab("average logFC") +
+ ylab("full logFC - reduced logFC") + theme_bw() + theme(text = element_text(family = "Gill Sans",
+ size = 10), strip.background = element_rect(fill = "white")) + ggtitle("bland-altman plot")
But the calculated FDR is very different:
> qplot(MAG_results_limma$adj.P.Val, MAG_results_limma_subset$adj.P.Val) + xlab("full model") +
+ ylab("reduced model")
> qplot((MAG_results_limma$logFC + MAG_results_limma_subset$logFC)/2, MAG_results_limma$logFC -
+ MAG_results_limma_subset$logFC) + xlab("average logFC") + ylab("full logFC - reduced logFC")
> qplot((MAG_results_limma$P.Value + MAG_results_limma_subset$P.Value)/2, MAG_results_limma$P.Value -
+ MAG_results_limma_subset$P.Value) + xlab("average pvalue") + ylab("full pvalue - reduced pvalue")
> qplot((fit_limma$sigma + fit_limma_subset$sigma)/2, fit_limma$sigma - fit_limma_subset$sigma) +
+ stat_binhex() + scale_x_sqrt() + xlab("average variance") + ylab("full - reduced variance")
> in_either = unique(c(rownames(subset(MAG_results_limma, adj.P.Val < 0.1)), rownames(subset(MAG_results_limma_subset,
+ adj.P.Val < 0.1))))
>
> signew = MAG_results_limma[in_either, ]
> signew$run = "full"
> signew$sig = signew$adj.P.Val < 0.1
> sigold = MAG_results_limma_subset[in_either, ]
> sigold$run = "subset"
> sigold$sig = sigold$adj.P.Val < 0.1
>
> m = merge(signew, sigold, by = "row.names")
> m$both = ifelse(m$sig.x & m$sig.y, "both", ifelse(m$sig.x, "full only", "subset only"))
> m$minpval <- pmin(m$adj.P.Val.x, m$adj.P.Val.y)
> ggplot(m, aes((AveExpr.x + AveExpr.y)/2, (AveExpr.x - AveExpr.y), size = minpval)) +
+ geom_point(alpha = 0.6) + xlab("average expression") + ylab("expression difference") +
+ facet_wrap(~both) + scale_size(range = c(0.3, 3)) + element_blank() + guides(colour = guide_legend(override.aes = list(alpha = 1),
+ title = "found in"))
> allsig = rbind(signew, sigold)
> allsig$sig = allsig$adj.P.Val < 0.1
> allsig$id = rownames(allsig)
> library(dplyr)
>
> ggplot(allsig, aes(AveExpr, logFC, shape = run, color = sig)) + geom_point()
Write out the counts and subsetted summardata file for use with limma.
> write.table(file = "limma.counts", counts, col.names = TRUE, row.names = TRUE,
+ quote = FALSE, sep = "\t")
> write.table(file = "limma.summarydata.txt", summarydata, col.names = TRUE, row.names = TRUE,
+ quote = FALSE, sep = "\t")
These are some helper functions we defined to do a GO ontology of the results of a DESeq2 results dataframe.
> library(biomaRt)
> library(GOstats)
> library(GSEABase)
> mart = useMart(biomart = "vb_gene_mart_1512", host = "biomart.vectorbase.org")
> albimanus = useDataset(mart, dataset = "aalbimanus_eg_gene")
> bm = getBM(mart = albimanus, attributes = c("ensembl_gene_id", "transcript_biotype",
+ "go_name_1006", "go_accession", "go_namespace_1003", "external_gene_id",
+ "go_linkage_type"))
> goframeData = bm[, c("go_accession", "go_linkage_type", "ensembl_gene_id")]
> colnames(goframeData) = c("frame.go_id", "frame.Evidence", "frame.gene_id")
> goframeData = subset(goframeData, frame.Evidence != "")
> goFrame = GOFrame(goframeData)
> goAllFrame = GOAllFrame(goFrame)
> gsc = GeneSetCollection(goAllFrame, setType = GOCollection())
>
> run_go = function(universe, genes, gsc) {
+ # given a list of gene ids of the gene universe and the enriched genes and a
+ # genesetcollection of ontology terms, find enrichment for molecular
+ # function, biological process and cellular component
+ params = GSEAGOHyperGParams(name = "foo", geneSetCollection = gsc, geneIds = genes,
+ universeGeneIds = universe, ontology = "MF", pvalueCutoff = 0.1, conditional = FALSE,
+ testDirection = "over")
+ over = hyperGTest(params)
+ mf = summary(over)
+ colnames(mf)[1] = "GOID"
+ params = GSEAGOHyperGParams(name = "foo", geneSetCollection = gsc, geneIds = genes,
+ universeGeneIds = universe, ontology = "BP", pvalueCutoff = 0.1, conditional = FALSE,
+ testDirection = "over")
+ over = hyperGTest(params)
+ bp = summary(over)
+ colnames(bp)[1] = "GOID"
+ params = GSEAGOHyperGParams(name = "foo", geneSetCollection = gsc, geneIds = genes,
+ universeGeneIds = universe, ontology = "CC", pvalueCutoff = 0.1, conditional = FALSE,
+ testDirection = "over")
+ over = hyperGTest(params)
+ cc = summary(over)
+ colnames(cc)[1] = "GOID"
+ df = data.frame()
+ if (nrow(mf) > 0) {
+ mf$ontology = "MF"
+ df = rbind(df, mf)
+ }
+ if (nrow(cc) > 0) {
+ cc$ontology = "CC"
+ df = rbind(df, cc)
+ }
+ if (nrow(bp) > 0) {
+ bp$ontology = "BP"
+ df = rbind(df, bp)
+ }
+ return(df)
+ }
>
> deseq_go = function(deseqres, gsc) {
+ # run GO ontology analysis using GOstats from a DESeq dataframe and a gene
+ # set collection
+ universe = rownames(subset(deseqres, baseMean > 10))
+ genes = rownames(subset(deseqres, baseMean > 10 & padj < 0.1))
+ return(run_go(universe, genes, gsc))
+ }
Here we looked at GO term enrichment for differentially expressed genes in mated compared to virgin tissues within each tissue.
This performs a hypergeometric test for enriched genes against a background set. The background set of genes are all genes expressed in the tissue. We defined expressed as having a baseMean expression > 10. The enriched genes are defined as genes with a baseMean expression > 10 and an adjusted p-value for differential expression with mated/virgin status of < 0.1. We saved these as atrium_go.tsv, head_go.tsv, etc.
> atrium_go = deseq_go(atrium, gsc)
> write.table(atrium_go, file = "atrium_go.tsv", sep = "\t", row.names = FALSE,
+ col.names = TRUE, quote = FALSE)
> MAG_go = deseq_go(MAG, gsc)
> write.table(MAG_go, file = "MAG_go.tsv", sep = "\t", row.names = FALSE, col.names = TRUE,
+ quote = FALSE)
> head_go = deseq_go(head, gsc)
> write.table(head_go, file = "head_go.tsv", sep = "\t", row.names = FALSE, col.names = TRUE,
+ quote = FALSE)
> spermatheca_go = deseq_go(spermatheca, gsc)
> write.table(spermatheca_go, file = "spermatheca_go.tsv", sep = "\t", row.names = FALSE,
+ col.names = TRUE, quote = FALSE)